In [1]:
#importing libraries
import pandas as pd
import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
In [2]:
data=pd.read_csv("insurance.csv")
data.head()
Out[2]:
age sex bmi children smoker region expenses
0 19 female 27.9 0 yes southwest 16884.92
1 18 male 33.8 1 no southeast 1725.55
2 28 male 33.0 3 no southeast 4449.46
3 33 male 22.7 0 no northwest 21984.47
4 32 male 28.9 0 no northwest 3866.86
In [3]:
data.shape   #dimenssion of our dataset
Out[3]:
(1338, 7)
In [4]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
In [5]:
data.describe(include=np.number).transpose()    #simple summary report
Out[5]:
count mean std min 25% 50% 75% max
age 1338.0 39.207025 14.049960 18.00 27.0000 39.00 51.000 64.00
bmi 1338.0 30.665471 6.098382 16.00 26.3000 30.40 34.700 53.10
children 1338.0 1.094918 1.205493 0.00 0.0000 1.00 2.000 5.00
expenses 1338.0 13270.422414 12110.011240 1121.87 4740.2875 9382.03 16639.915 63770.43
In [6]:
#frequency of insurence for different age
fig = px.histogram(data['age'])
fig.show()
In [7]:
#Expenses due to different age
age_group_max=data.groupby('age').max()['expenses']
age_group_min=data.groupby('age').min()['expenses']
age_group_mean=data.groupby('age').mean()['expenses']
age_group=data.groupby('age').sum()['expenses']
fig = go.Figure()
fig.add_trace(go.Scatter(x=age_group.index,y=age_group.values,mode='lines+markers',name='Total expenses'))
fig.add_trace(go.Scatter(x=age_group_max.index,y=age_group_max.values,mode='lines+markers',name='Max expenses'))
fig.add_trace(go.Scatter(x=age_group_min.index,y=age_group_min.values,mode='lines+markers',name='Min expenses'))
fig.add_trace(go.Scatter(x=age_group_mean.index,y=age_group_mean.values,mode='lines+markers',name='Mean expenses'))
fig.update_layout(title='Expenses due to different age',
                   xaxis_title='Age',
                   yaxis_title='Expenses')
fig.show()
In [8]:
#Expenses for various bmi index
bmi_group=data.groupby('bmi').mean()['expenses']
fig = go.Figure()
fig.add_trace(go.Scatter(x=bmi_group.index,y=bmi_group.values,stackgroup = 'one'))
fig.update_layout(title='Expenses for various bmi index',xaxis_title='bmi',yaxis_title='Expenses')
fig.show()
In [9]:
#expenses changes for different childeren no
child_group=data.groupby('children').sum()['expenses']
fig = px.bar(child_group)
fig.update_layout(title='Expenses for different childeren no',yaxis_title='Expenses')
fig.show()
In [10]:
smoker_group=data.groupby('smoker').sum()['expenses']
fig = px.pie(values=smoker_group.values,names=smoker_group.index, title='pertcentage of smoker and non-smoker expenses')
fig.show()   #Yes: smoker ,No: non-smoker
In [11]:
#expenses changes for different childeren no
region_group=data.groupby('region').sum()['expenses']
colors = ['gold', 'mediumturquoise', 'darkorange', 'lightgreen']

fig = go.Figure(data=[go.Pie(labels=region_group.index,
                             values=region_group.values,hole=.5)])
fig.update_traces(hoverinfo='value+percent', textinfo='label', textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='#000000', width=2)))
fig.show()
In [12]:
ax=data['expenses'].plot.hist(density=True,bins=range(1,12))
data["expenses"].plot.density(ax=ax)
C:\Users\SVMY\anaconda3\lib\site-packages\numpy\lib\histograms.py:905: RuntimeWarning:

invalid value encountered in true_divide

Out[12]:
<AxesSubplot:ylabel='Density'>
In [13]:
ax = sns.heatmap(data.corr(), annot=True)
In [14]:
fig = px.violin(data, y="expenses", color="sex",points='all',box=True,
                violinmode='overlay', # draw violins on top of each other
                # default violinmode is 'group' as in example above
                hover_data=data.columns)

fig.update_layout(yaxis_zeroline=False)
fig.show()
In [15]:
#remove outlier from expenses
q_low = data["expenses"].quantile(0.01)
q_hi  = data["expenses"].quantile(0.99)

data = data[(data["expenses"] < q_hi) & (data["expenses"] > q_low)]
In [16]:
data.shape
Out[16]:
(1310, 7)
In [17]:
df = pd.get_dummies(data, columns=['sex','smoker',"region"])
df 
Out[17]:
age bmi children expenses sex_female sex_male smoker_no smoker_yes region_northeast region_northwest region_southeast region_southwest
0 19 27.9 0 16884.92 1 0 0 1 0 0 0 1
1 18 33.8 1 1725.55 0 1 1 0 0 0 1 0
2 28 33.0 3 4449.46 0 1 1 0 0 0 1 0
3 33 22.7 0 21984.47 0 1 1 0 0 1 0 0
4 32 28.9 0 3866.86 0 1 1 0 0 1 0 0
... ... ... ... ... ... ... ... ... ... ... ... ...
1333 50 31.0 3 10600.55 0 1 1 0 0 1 0 0
1334 18 31.9 0 2205.98 1 0 1 0 1 0 0 0
1335 18 36.9 0 1629.83 1 0 1 0 0 0 1 0
1336 21 25.8 0 2007.95 1 0 1 0 0 0 0 1
1337 61 29.1 0 29141.36 1 0 0 1 0 1 0 0

1310 rows × 12 columns

In [18]:
x=df.drop("expenses", axis='columns')
y=df['expenses']
In [19]:
from sklearn.model_selection import train_test_split
#data slipt into 80% and 20%
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2) 
In [20]:
from sklearn import linear_model
regr = linear_model.LinearRegression()
regr.fit(x_train, y_train)
Out[20]:
LinearRegression()
In [21]:
from sklearn.metrics import confusion_matrix
y_pred=regr.predict(x_test)
from sklearn.metrics import r2_score
r=round(r2_score(y_test, y_pred),2)*100
print("accuracy of this model is :",int(r),"%")
accuracy of this model is : 75 %
In [33]:
from sklearn.metrics import mean_squared_error
from math import sqrt

rms = sqrt(mean_squared_error(y_test, y_pred))
rms
Out[33]:
13234.176301857424
In [ ]: